%matplotlib inline
# Data preprocessing libraries
import numpy as np
import pandas as pd
from pandas.plotting import parallel_coordinates
import os
import sqlite3
import math
from collections import Counter
from pathlib import Path
from tqdm import tqdm
# Visualization
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Model
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Config
mpl.rcParams['font.family'] = 'monospace'
sns.set_theme(style="white", palette=None)
plotly.offline.init_notebook_mode()
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
# Reading csv files and drop the first column
df_train = pd.read_csv(r"data/fraudTrain.csv")
df_train.drop(df_train.columns[0], axis=1, inplace=True)
df_test = pd.read_csv(r"data/fraudTest.csv")
df_test.drop(df_test.columns[0], axis=1, inplace=True)
df = pd.concat([df_train, df_test], axis =0).reset_index()
df.shape
(1852394, 23)
# delete df_train and df_test to save memory
print('Deleting df_train and df_test')
del([df_train, df_test])
Deleting df_train and df_test
df.tail(3)
| index | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1852391 | 555716 | 2020-12-31 23:59:15 | 6011724471098086 | fraud_Rau-Robel | kids_pets | 86.88 | Ann | Lawson | F | 144 Evans Islands Apt. 683 | ... | 46.1966 | -118.9017 | 3684 | Musician | 1981-11-29 | 6c5b7c8add471975aa0fec023b2e8408 | 1388534355 | 46.658340 | -119.715054 | 0 |
| 1852392 | 555717 | 2020-12-31 23:59:24 | 4079773899158 | fraud_Breitenberg LLC | travel | 7.99 | Eric | Preston | M | 7020 Doyle Stream Apt. 951 | ... | 44.6255 | -116.4493 | 129 | Cartographer | 1965-12-15 | 14392d723bb7737606b2700ac791b7aa | 1388534364 | 44.470525 | -117.080888 | 0 |
| 1852393 | 555718 | 2020-12-31 23:59:34 | 4170689372027579 | fraud_Dare-Marvin | entertainment | 38.13 | Samuel | Frey | M | 830 Myers Plaza Apt. 384 | ... | 35.6665 | -97.4798 | 116001 | Media buyer | 1993-05-10 | 1765bb45b3aa3224b4cdcb6e7a96cee3 | 1388534374 | 36.210097 | -97.036372 | 0 |
3 rows × 23 columns
df.columns
Index(['index', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
'merch_lat', 'merch_long', 'is_fraud'],
dtype='object')
df.rename(columns={"trans_date_trans_time":"transaction_time",
"cc_num":"credit_card_number",
"amt":"amount(usd)",
"trans_num":"transaction_id"},
inplace=True)
Transaction_time and dob should be in pd.datetime format
df["transaction_time"] = pd.to_datetime(df["transaction_time"], infer_datetime_format=True)
df["dob"] = pd.to_datetime(df["dob"], infer_datetime_format=True)
from datetime import datetime
# Apply function utcfromtimestamp and drop column unix_time
df['time'] = df['unix_time'].apply(datetime.utcfromtimestamp)
df.drop('unix_time', axis=1)
# Add cloumn hour of day
df['hour_of_day'] = df.time.dt.hour
df[['time','hour_of_day']]
| time | hour_of_day | |
|---|---|---|
| 0 | 2012-01-01 00:00:18 | 0 |
| 1 | 2012-01-01 00:00:44 | 0 |
| 2 | 2012-01-01 00:00:51 | 0 |
| 3 | 2012-01-01 00:01:16 | 0 |
| 4 | 2012-01-01 00:03:06 | 0 |
| ... | ... | ... |
| 1852389 | 2013-12-31 23:59:07 | 23 |
| 1852390 | 2013-12-31 23:59:09 | 23 |
| 1852391 | 2013-12-31 23:59:15 | 23 |
| 1852392 | 2013-12-31 23:59:24 | 23 |
| 1852393 | 2013-12-31 23:59:34 | 23 |
1852394 rows × 2 columns
# Change dtypes
df.credit_card_number = df.credit_card_number.astype('category')
df.is_fraud = df.is_fraud.astype('category')
df.hour_of_day = df.hour_of_day.astype('category')
# Check
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1852394 entries, 0 to 1852393 Data columns (total 25 columns): # Column Dtype --- ------ ----- 0 index int64 1 transaction_time datetime64[ns] 2 credit_card_number category 3 merchant object 4 category object 5 amount(usd) float64 6 first object 7 last object 8 gender object 9 street object 10 city object 11 state object 12 zip int64 13 lat float64 14 long float64 15 city_pop int64 16 job object 17 dob datetime64[ns] 18 transaction_id object 19 unix_time int64 20 merch_lat float64 21 merch_long float64 22 is_fraud category 23 time datetime64[ns] 24 hour_of_day category dtypes: category(3), datetime64[ns](3), float64(5), int64(4), object(10) memory usage: 318.0+ MB
np.round(df.describe(), 2)
| index | amount(usd) | zip | lat | long | city_pop | unix_time | merch_lat | merch_long | |
|---|---|---|---|---|---|---|---|---|---|
| count | 1852394.00 | 1852394.00 | 1852394.00 | 1852394.00 | 1852394.00 | 1852394.00 | 1.852394e+06 | 1852394.00 | 1852394.00 |
| mean | 537193.44 | 70.06 | 48813.26 | 38.54 | -90.23 | 88643.67 | 1.358674e+09 | 38.54 | -90.23 |
| std | 366910.96 | 159.25 | 26881.85 | 5.07 | 13.75 | 301487.62 | 1.819508e+07 | 5.11 | 13.76 |
| min | 0.00 | 1.00 | 1257.00 | 20.03 | -165.67 | 23.00 | 1.325376e+09 | 19.03 | -166.67 |
| 25% | 231549.00 | 9.64 | 26237.00 | 34.67 | -96.80 | 741.00 | 1.343017e+09 | 34.74 | -96.90 |
| 50% | 463098.00 | 47.45 | 48174.00 | 39.35 | -87.48 | 2443.00 | 1.357089e+09 | 39.37 | -87.44 |
| 75% | 833575.75 | 83.10 | 72042.00 | 41.94 | -80.16 | 20328.00 | 1.374581e+09 | 41.96 | -80.25 |
| max | 1296674.00 | 28948.90 | 99921.00 | 66.69 | -67.95 | 2906700.00 | 1.388534e+09 | 67.51 | -66.95 |
groups = [pd.Grouper(key="transaction_time", freq="1W"), "is_fraud"]
df_ = df.groupby(by=groups).agg({"amount(usd)":'mean',"transaction_id":"count"}).reset_index()
def add_traces(df, x, y,hue, mode, cmap, showlegend=None):
name_map = {1:"Yes", 0:"No"}
traces = []
for flag in df[hue].unique():
traces.append(
go.Scatter(
x=df[df[hue]==flag][x],
y=df[df[hue]==flag][y],
mode=mode,
marker=dict(color=cmap[flag]),
showlegend=showlegend,
name=name_map[flag]
)
)
return traces
fig = make_subplots(rows=2, cols=2,
specs=[
[{}, {}],
[{"colspan":2}, None]
],
subplot_titles=("Amount(usd) over time", "Number of transactions overtime",
"Number of transaction by amount(usd)")
)
ntraces = add_traces(df=df_,x='transaction_time',y='amount(usd)',hue='is_fraud',mode='lines',
showlegend=True, cmap=['#61E50F','#D93C1D'])
for trace in ntraces:
fig.add_trace(
trace,
row=1,col=1
)
ntraces = add_traces(df=df_,x='transaction_time',y='transaction_id',hue='is_fraud',mode='lines',
showlegend=False, cmap=['#61E50F','#D93C1D'])
for trace in ntraces:
fig.add_trace(
trace,
row=1,col=2
)
ntraces = add_traces(df=df_,x='transaction_id',y='amount(usd)',hue='is_fraud',mode='markers',
showlegend=True, cmap=['#61E50F','#D93C1D'])
for trace in ntraces:
fig.add_trace(
trace,
row=2,col=1
)
fig.update_layout(height=780,
width=960,
legend=dict(title='Is fraud?'),
plot_bgcolor='#fafafa',
title='Overview'
)
fig.show()
df_ = df.groupby(by=[pd.Grouper(key="transaction_time", freq="1W"),
'is_fraud','category']).agg({"amount(usd)":'mean',"transaction_id":"count"}).reset_index()
fig = px.scatter(df_,
x='transaction_time',
y='amount(usd)',
color='is_fraud',
facet_col ='category',
facet_col_wrap=3,
facet_col_spacing=.04,
color_discrete_map={0:'#61E50F', 1:'#D93C1D'}
)
fig.update_layout(height=1400,
width=960,
legend=dict(title='Is fraud?'),
plot_bgcolor='#fafafa'
)
fig.update_yaxes(matches=None)
fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
fig.for_each_xaxis(lambda xaxis: xaxis.update(showticklabels=True, title=''))
fig.show();
df_ = df.groupby(by=[pd.Grouper(key="transaction_time", freq="1M"),
'is_fraud','category']).agg({"amount(usd)":'sum',"transaction_id":"count"}).reset_index()
fig = px.area(
df_[df_.is_fraud==1],
x='transaction_time',
y='amount(usd)',
color='category',
color_discrete_sequence=px.colors.qualitative.Dark24
)
fig.update_layout(height=600,
width=960,
legend=dict(title='Categories'),
plot_bgcolor='#fafafa'
)
fig.show();
# Specified list of 12 merchants with the highest number of transactions.
top12_merchants = df.merchant.value_counts()[:12]
df_ = df.groupby(by=[pd.Grouper(key="transaction_time", freq="1W"),'is_fraud',
'merchant']).agg({"amount(usd)":'mean',"transaction_id":"count"}).reset_index()
df_ = df_[df_.merchant.isin(top12_merchants.index)]
fig = px.scatter(df_,
x='transaction_time',
y='amount(usd)',
color='is_fraud',
facet_col ='merchant',
facet_col_wrap=3,
facet_col_spacing=.06,
category_orders={'merchant': top12_merchants.index}, # order the subplots
color_discrete_map={1:'#61E50F', 0:'#D93C1D'}
)
fig.update_layout(height=1200,
width=960,
title='Top 12 merchants with highest number of transactions per week',
legend=dict(title='Is fraud?'),
plot_bgcolor='#fafafa'
)
fig.update_yaxes(matches=None)
fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
fig.for_each_xaxis(lambda xaxis: xaxis.update(showticklabels=True, title=''))
fig.show();
groups = ['is_fraud','job']
df_ = df.groupby(by=groups).agg({"amount(usd)":'mean',"transaction_id":"count"}).fillna(0).reset_index()
# Top 10 jobs had most fraud transactions.
df_ = df_[df_.is_fraud==1].sort_values(by='transaction_id',
ascending=False).drop_duplicates('job', keep='first').iloc[:10, :]
df_
| is_fraud | job | amount(usd) | transaction_id | |
|---|---|---|---|---|
| 880 | 1 | Quantity surveyor | 611.805652 | 69 |
| 806 | 1 | Naval architect | 650.121970 | 66 |
| 784 | 1 | Materials engineer | 561.092097 | 62 |
| 539 | 1 | Audiological scientist | 662.505172 | 58 |
| 918 | 1 | Senior tax professional/tax inspector | 570.492456 | 57 |
| 977 | 1 | Trading standards officer | 478.137143 | 56 |
| 843 | 1 | Podiatrist | 477.762593 | 54 |
| 691 | 1 | Film/video editor | 528.820577 | 52 |
| 589 | 1 | Colour technologist | 440.824706 | 51 |
| 685 | 1 | Exhibition designer | 524.067255 | 51 |
fig = px.bar(df_,
y='job', x='transaction_id',
color='amount(usd)',
color_continuous_scale=px.colors.sequential.Rainbow,
labels={'job':'Job title',
'transaction_id': 'Number of fraud transactions'},
category_orders = {"job": df_.job.values},
width=960,
height=600)
fig.update_layout(
title=dict(
text='Amount(usd) among top 10 jobs with the most fraud transactions'
),
plot_bgcolor='#fafafa'
)
fig.update_coloraxes(
colorbar=dict(
title='Amount(usd) of transactions',
orientation='h',
x=1
),
reversescale=True
)
fig.show()
groups = ['credit_card_number']
df_ = df.groupby(by=groups).agg({"amount(usd)":'mean',"transaction_id":"count"}).fillna(0).reset_index()
df_.sort_values('transaction_id', ascending=False, inplace=True)
df_ = df_.head(10)
df_
| credit_card_number | amount(usd) | transaction_id | |
|---|---|---|---|
| 185 | 30270432095985 | 56.479135 | 4392 |
| 886 | 6538441737335434 | 76.542413 | 4392 |
| 887 | 6538891242532018 | 87.509667 | 4386 |
| 703 | 4364010865167176 | 47.876443 | 4386 |
| 747 | 4642255475285942 | 59.124403 | 4386 |
| 843 | 6011438889172900 | 91.422839 | 4385 |
| 332 | 344709867813900 | 89.378027 | 4385 |
| 787 | 4904681492230012 | 60.779008 | 4384 |
| 737 | 4586810168620942 | 72.951437 | 4384 |
| 135 | 4745996322265 | 75.752662 | 4384 |
df_ = df[df.is_fraud==1].groupby(by='hour_of_day').agg({'transaction_id':'count'}).reset_index()
fig = px.bar(data_frame=df_,
x='hour_of_day',
y='transaction_id',
labels={'transaction_id':'Number of transaction'})
fig.update_layout(
title=dict(
text='Number of FRAUD transactions by hours of day'
),
plot_bgcolor='#ED2B2A'
)
fig.update_xaxes(type='category')
fig = plt.figure(figsize=(18,9))
mask = np.triu(np.ones_like(df.corr()))
sns.heatmap(df.corr(), mask=mask, cmap='coolwarm', annot=True)
C:\Users\ndaru\AppData\Local\Temp\ipykernel_1380\2504256764.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. C:\Users\ndaru\AppData\Local\Temp\ipykernel_1380\2504256764.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
<Axes: >
features = ['transaction_id', 'hour_of_day', 'category', 'amount(usd)', 'merchant', 'job']
#
X = df[features].set_index("transaction_id")
y = df['is_fraud']
print('X shape:{}\ny shape:{}'.format(X.shape,y.shape))
X shape:(1852394, 5) y shape:(1852394,)
# Encoding categorical data
enc = OrdinalEncoder(dtype=np.int64)
enc.fit(X.loc[:,['category','merchant','job']])
X.loc[:, ['category','merchant','job']] = enc.transform(X[['category','merchant','job']])
C:\Users\ndaru\AppData\Local\Temp\ipykernel_1380\4109020072.py:5: DeprecationWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
X['hour_of_day'] = X['hour_of_day'].astype(int)
X.head()
| hour_of_day | category | amount(usd) | merchant | job | |
|---|---|---|---|---|---|
| transaction_id | |||||
| 0b242abb623afc578575680df30655b9 | 0 | 8 | 4.97 | 514 | 372 |
| 1f76529f8574734946361c461b024d99 | 0 | 4 | 107.23 | 241 | 431 |
| a1a22d70485983eac12b5b88dad1cf95 | 0 | 0 | 220.11 | 390 | 308 |
| 6b849c168bdad6f867558c3793159a81 | 0 | 2 | 45.00 | 360 | 330 |
| a41d7549acf90789359a9aa5346dcb46 | 0 | 9 | 41.96 | 297 | 116 |
X.info()
<class 'pandas.core.frame.DataFrame'> Index: 1852394 entries, 0b242abb623afc578575680df30655b9 to 1765bb45b3aa3224b4cdcb6e7a96cee3 Data columns (total 5 columns): # Column Dtype --- ------ ----- 0 hour_of_day int32 1 category int64 2 amount(usd) float64 3 merchant int64 4 job int64 dtypes: float64(1), int32(1), int64(3) memory usage: 77.7+ MB
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))
Training set size: 1481915 Testing set size: 370479
from abod import ABOD
# Create an instance of ABOD model
model = ABOD()
model.fit(X_train)
ABOD(contamination=0.1, method='fast', n_neighbors=5)
# Predict the anomaly scores and plot them
scores = model.decision_scores_
# plt.figure(figsize=(6,5))
# plt.plot(scores)
# plt.title('Training Curve')
# plt.xlabel('Data Point')
# plt.ylabel('Outlier Score')
# plt.show()
# Identify the outliers based on a threshold
threshold_ABOD = np.percentile(scores, 90)
anomalies_ABOD = np.where(scores > threshold_ABOD)[0]
print("Number of anomalies in ABOD:", len(anomalies_ABOD))
print("Indices of anomalies:", anomalies_ABOD)
Number of anomalies in ABOD: 148192 Indices of anomalies: [ 9 14 27 ... 1481879 1481885 1481892]
y_pred = model.predict(X_test)
from sklearn.metrics import precision_score, recall_score, f1_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}")
Precision: 0.042, Recall: 0.802, F1-score: 0.080
clfr = classification_report(y_test, y_pred)
print("classification report:\n", clfr)
classification report:
precision recall f1-score support
0 1.00 0.90 0.95 368526
1 0.04 0.80 0.08 1953
accuracy 0.90 370479
macro avg 0.52 0.85 0.51 370479
weighted avg 0.99 0.90 0.94 370479
# accuracy of the model
accuracy_ABOD = accuracy_score(y_test, y_pred)
print("Accuracy of the model:", (100 - accuracy_ABOD))
Accuracy of the model: 99.09692317243352
from sklearn.preprocessing import StandardScaler
from keras.layers import Input, Dense
from keras.models import Model
# Normalize the data
scaler = StandardScaler()
X_norm = scaler.fit_transform(X_train)
input_layer = Input(shape=(X_norm.shape[1],))
encoded = Dense(32, activation='relu')(input_layer)
decoded = Dense(X_norm.shape[1], activation='linear')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')
# Train the autoencoder
history = autoencoder.fit(X_norm, X_norm, epochs=100, batch_size=32, validation_data=(X_test, X_test))
# Plot the loss values
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Autoencoder Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
Epoch 1/100 46310/46310 [==============================] - 101s 2ms/step - loss: 0.0027 - val_loss: 798.8865 Epoch 2/100 46310/46310 [==============================] - 101s 2ms/step - loss: 8.2566e-05 - val_loss: 812.2175 Epoch 3/100 46310/46310 [==============================] - 91s 2ms/step - loss: 1.3715e-04 - val_loss: 799.1724 Epoch 4/100 46310/46310 [==============================] - 87s 2ms/step - loss: 7.1967e-05 - val_loss: 785.7149 Epoch 5/100 46310/46310 [==============================] - 93s 2ms/step - loss: 2.5809e-05 - val_loss: 745.5618 Epoch 6/100 46310/46310 [==============================] - 90s 2ms/step - loss: 5.8646e-05 - val_loss: 665.4046 Epoch 7/100 46310/46310 [==============================] - 90s 2ms/step - loss: 2.4789e-05 - val_loss: 683.3556 Epoch 8/100 46310/46310 [==============================] - 86s 2ms/step - loss: 2.0018e-05 - val_loss: 795.3459 Epoch 9/100 46310/46310 [==============================] - 97s 2ms/step - loss: 4.9162e-05 - val_loss: 737.0353 Epoch 10/100 46310/46310 [==============================] - 93s 2ms/step - loss: 3.3235e-05 - val_loss: 597.7823 Epoch 11/100 46310/46310 [==============================] - 98s 2ms/step - loss: 5.2630e-05 - val_loss: 488.4552 Epoch 12/100 46310/46310 [==============================] - 1697s 37ms/step - loss: 1.8584e-05 - val_loss: 474.4684 Epoch 13/100 46310/46310 [==============================] - 95s 2ms/step - loss: 1.8993e-05 - val_loss: 416.3220 Epoch 14/100 46310/46310 [==============================] - 101s 2ms/step - loss: 3.0021e-05 - val_loss: 371.9894 Epoch 15/100 46310/46310 [==============================] - 98s 2ms/step - loss: 2.6584e-05 - val_loss: 387.7877 Epoch 16/100 46310/46310 [==============================] - 101s 2ms/step - loss: 1.3983e-04 - val_loss: 542.9305 Epoch 17/100 46310/46310 [==============================] - 95s 2ms/step - loss: 9.8662e-06 - val_loss: 442.0613 Epoch 18/100 46310/46310 [==============================] - 101s 2ms/step - loss: 1.0518e-05 - val_loss: 483.0312 Epoch 19/100 46310/46310 [==============================] - 100s 2ms/step - loss: 1.3891e-05 - val_loss: 435.2597 Epoch 20/100 46310/46310 [==============================] - 97s 2ms/step - loss: 1.2011e-05 - val_loss: 385.5776 Epoch 21/100 46310/46310 [==============================] - 97s 2ms/step - loss: 1.3489e-05 - val_loss: 346.0983 Epoch 22/100 46310/46310 [==============================] - 87s 2ms/step - loss: 6.6764e-06 - val_loss: 297.7719 Epoch 23/100 46310/46310 [==============================] - 86s 2ms/step - loss: 7.1062e-06 - val_loss: 237.6304 Epoch 24/100 46310/46310 [==============================] - 91s 2ms/step - loss: 6.9782e-06 - val_loss: 226.9504 Epoch 25/100 46310/46310 [==============================] - 87s 2ms/step - loss: 1.0896e-05 - val_loss: 178.5828 Epoch 26/100 46310/46310 [==============================] - 91s 2ms/step - loss: 8.5577e-06 - val_loss: 115.1383 Epoch 27/100 46310/46310 [==============================] - 87s 2ms/step - loss: 5.4364e-06 - val_loss: 81.3829 Epoch 28/100 46310/46310 [==============================] - 99s 2ms/step - loss: 1.4315e-05 - val_loss: 67.3545 Epoch 29/100 46310/46310 [==============================] - 108s 2ms/step - loss: 6.7536e-06 - val_loss: 70.9670 Epoch 30/100 46310/46310 [==============================] - 168s 4ms/step - loss: 1.0562e-05 - val_loss: 67.4418 Epoch 31/100 46310/46310 [==============================] - 105s 2ms/step - loss: 6.4250e-06 - val_loss: 67.4423 Epoch 32/100 46310/46310 [==============================] - 100s 2ms/step - loss: 1.3594e-05 - val_loss: 67.4837 Epoch 33/100 46310/46310 [==============================] - 108s 2ms/step - loss: 6.0162e-06 - val_loss: 67.2489 Epoch 34/100 46310/46310 [==============================] - 108s 2ms/step - loss: 7.6263e-06 - val_loss: 67.4615 Epoch 35/100 46310/46310 [==============================] - 101s 2ms/step - loss: 7.7855e-06 - val_loss: 73.2368 Epoch 36/100 46310/46310 [==============================] - 106s 2ms/step - loss: 5.8810e-06 - val_loss: 67.7864 Epoch 37/100 46310/46310 [==============================] - 100s 2ms/step - loss: 9.4593e-06 - val_loss: 67.4481 Epoch 38/100 46310/46310 [==============================] - 107s 2ms/step - loss: 6.6285e-06 - val_loss: 65.3129 Epoch 39/100 46310/46310 [==============================] - 100s 2ms/step - loss: 3.3031e-06 - val_loss: 67.4063 Epoch 40/100 46310/46310 [==============================] - 106s 2ms/step - loss: 3.6143e-06 - val_loss: 67.9348 Epoch 41/100 46310/46310 [==============================] - 101s 2ms/step - loss: 8.0979e-06 - val_loss: 69.3459 Epoch 42/100 46310/46310 [==============================] - 110s 2ms/step - loss: 6.5660e-06 - val_loss: 67.2906 Epoch 43/100 46310/46310 [==============================] - 100s 2ms/step - loss: 1.0220e-05 - val_loss: 67.5379 Epoch 44/100 46310/46310 [==============================] - 105s 2ms/step - loss: 5.8966e-06 - val_loss: 67.3803 Epoch 45/100 46310/46310 [==============================] - 104s 2ms/step - loss: 3.5860e-06 - val_loss: 66.1019 Epoch 46/100 46310/46310 [==============================] - 100s 2ms/step - loss: 5.3923e-06 - val_loss: 67.4551 Epoch 47/100 46310/46310 [==============================] - 113s 2ms/step - loss: 6.4108e-06 - val_loss: 67.4115 Epoch 48/100 46310/46310 [==============================] - 100s 2ms/step - loss: 4.6064e-06 - val_loss: 70.1752 Epoch 49/100 46310/46310 [==============================] - 105s 2ms/step - loss: 5.0708e-06 - val_loss: 79.5460 Epoch 50/100 46310/46310 [==============================] - 101s 2ms/step - loss: 5.4105e-06 - val_loss: 75.7583 Epoch 51/100 46310/46310 [==============================] - 107s 2ms/step - loss: 3.7249e-06 - val_loss: 64.4269 Epoch 52/100 46310/46310 [==============================] - 100s 2ms/step - loss: 3.1741e-06 - val_loss: 70.8249 Epoch 53/100 46310/46310 [==============================] - 103s 2ms/step - loss: 1.4301e-05 - val_loss: 73.1375 Epoch 54/100 46310/46310 [==============================] - 98s 2ms/step - loss: 1.0320e-05 - val_loss: 67.3938 Epoch 55/100 46310/46310 [==============================] - 105s 2ms/step - loss: 3.0510e-06 - val_loss: 67.5344 Epoch 56/100 46310/46310 [==============================] - 98s 2ms/step - loss: 4.2315e-06 - val_loss: 66.9729 Epoch 57/100 46310/46310 [==============================] - 103s 2ms/step - loss: 3.6442e-06 - val_loss: 67.4425 Epoch 58/100 46310/46310 [==============================] - 102s 2ms/step - loss: 4.3597e-06 - val_loss: 67.4446 Epoch 59/100 46310/46310 [==============================] - 99s 2ms/step - loss: 1.7322e-05 - val_loss: 76.4163 Epoch 60/100 46310/46310 [==============================] - 104s 2ms/step - loss: 5.4840e-06 - val_loss: 67.4960 Epoch 61/100 46310/46310 [==============================] - 97s 2ms/step - loss: 6.6607e-06 - val_loss: 69.4176 Epoch 62/100 46310/46310 [==============================] - 103s 2ms/step - loss: 5.0113e-06 - val_loss: 69.9497 Epoch 63/100 46310/46310 [==============================] - 98s 2ms/step - loss: 7.2803e-06 - val_loss: 67.7120 Epoch 64/100 46310/46310 [==============================] - 103s 2ms/step - loss: 4.8563e-06 - val_loss: 67.4497 Epoch 65/100 46310/46310 [==============================] - 100s 2ms/step - loss: 2.9979e-06 - val_loss: 67.0462 Epoch 66/100 46310/46310 [==============================] - 104s 2ms/step - loss: 4.0548e-06 - val_loss: 67.6371 Epoch 67/100 46310/46310 [==============================] - 98s 2ms/step - loss: 5.9152e-06 - val_loss: 67.4406 Epoch 68/100 46310/46310 [==============================] - 106s 2ms/step - loss: 5.5672e-06 - val_loss: 67.4671 Epoch 69/100 46310/46310 [==============================] - 99s 2ms/step - loss: 3.8862e-06 - val_loss: 66.8664 Epoch 70/100 46310/46310 [==============================] - 104s 2ms/step - loss: 6.1208e-06 - val_loss: 67.0503 Epoch 71/100 46310/46310 [==============================] - 98s 2ms/step - loss: 7.0417e-06 - val_loss: 68.0381 Epoch 72/100 46310/46310 [==============================] - 105s 2ms/step - loss: 4.3939e-06 - val_loss: 67.5628 Epoch 73/100 46310/46310 [==============================] - 100s 2ms/step - loss: 3.7445e-06 - val_loss: 68.0660 Epoch 74/100 46310/46310 [==============================] - 95s 2ms/step - loss: 7.0234e-06 - val_loss: 67.4700 Epoch 75/100 46310/46310 [==============================] - 100s 2ms/step - loss: 5.9367e-06 - val_loss: 70.2799 Epoch 76/100 46310/46310 [==============================] - 93s 2ms/step - loss: 3.0918e-06 - val_loss: 75.9240 Epoch 77/100 46310/46310 [==============================] - 101s 2ms/step - loss: 4.1638e-06 - val_loss: 80.6550 Epoch 78/100 46310/46310 [==============================] - 96s 2ms/step - loss: 4.1391e-06 - val_loss: 86.4364 Epoch 79/100 46310/46310 [==============================] - 100s 2ms/step - loss: 4.6143e-06 - val_loss: 69.2294 Epoch 80/100 46310/46310 [==============================] - 95s 2ms/step - loss: 7.4677e-06 - val_loss: 67.4877 Epoch 81/100 46310/46310 [==============================] - 100s 2ms/step - loss: 1.2753e-05 - val_loss: 68.0223 Epoch 82/100 46310/46310 [==============================] - 94s 2ms/step - loss: 4.6475e-06 - val_loss: 67.3934 Epoch 83/100 46310/46310 [==============================] - 107s 2ms/step - loss: 4.8149e-06 - val_loss: 68.9184 Epoch 84/100 46310/46310 [==============================] - 96s 2ms/step - loss: 4.4906e-06 - val_loss: 69.5349 Epoch 85/100 46310/46310 [==============================] - 100s 2ms/step - loss: 8.2413e-06 - val_loss: 86.2081 Epoch 86/100 46310/46310 [==============================] - 96s 2ms/step - loss: 7.1900e-06 - val_loss: 76.4586 Epoch 87/100 46310/46310 [==============================] - 102s 2ms/step - loss: 5.0467e-06 - val_loss: 95.7874 Epoch 88/100 46310/46310 [==============================] - 95s 2ms/step - loss: 5.5555e-06 - val_loss: 86.1176 Epoch 89/100 46310/46310 [==============================] - 101s 2ms/step - loss: 4.8585e-06 - val_loss: 68.7187 Epoch 90/100 46310/46310 [==============================] - 97s 2ms/step - loss: 4.9291e-06 - val_loss: 71.0593 Epoch 91/100 46310/46310 [==============================] - 101s 2ms/step - loss: 3.6411e-06 - val_loss: 72.2039 Epoch 92/100 46310/46310 [==============================] - 97s 2ms/step - loss: 6.3930e-06 - val_loss: 67.8203 Epoch 93/100 46310/46310 [==============================] - 100s 2ms/step - loss: 5.2665e-06 - val_loss: 67.4419 Epoch 94/100 46310/46310 [==============================] - 95s 2ms/step - loss: 4.1132e-06 - val_loss: 62.6383 Epoch 95/100 46310/46310 [==============================] - 101s 2ms/step - loss: 6.5242e-06 - val_loss: 68.2449 Epoch 96/100 46310/46310 [==============================] - 97s 2ms/step - loss: 4.3387e-06 - val_loss: 67.4390 Epoch 97/100 46310/46310 [==============================] - 100s 2ms/step - loss: 5.6844e-06 - val_loss: 68.6650 Epoch 98/100 46310/46310 [==============================] - 96s 2ms/step - loss: 4.1366e-06 - val_loss: 67.5588 Epoch 99/100 46310/46310 [==============================] - 109s 2ms/step - loss: 4.8993e-06 - val_loss: 67.7765 Epoch 100/100 46310/46310 [==============================] - 104s 2ms/step - loss: 4.1623e-06 - val_loss: 67.3544
# Predict the reconstructed X_test
X_test_pred = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - X_test_pred, 2), axis=1)
# threshold to classify as fraud or not fraud
threshold = np.mean(mse) + 3*np.std(mse)
# Create the predicted labels based on the threshold
y_pred = np.where(mse > threshold, 1, 0)
11578/11578 [==============================] - 16s 1ms/step
plt.figure(figsize=(10, 7))
plt.plot(X_norm[0], label='Original Data')
plt.plot(X_test_pred[0], label='Reconstructed Data')
plt.legend()
plt.title('Original vs Reconstructed Data')
plt.show()
# Create the confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
# Print the confusion matrix and classification report
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)
C:\Users\ndaru\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Confusion Matrix:
[[368526 0]
[ 1953 0]]
Classification Report:
precision recall f1-score support
0 0.99 1.00 1.00 368526
1 0.00 0.00 0.00 1953
accuracy 0.99 370479
macro avg 0.50 0.50 0.50 370479
weighted avg 0.99 0.99 0.99 370479
C:\Users\ndaru\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. C:\Users\ndaru\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
# Plot the confusion matrix
plt.figure(figsize=(6,5))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.colorbar()
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.xticks([0,1])
plt.yticks([0,1])
plt.show()
# Evaluate the performance of the autoencoder
mse = autoencoder.evaluate(X_norm, X_norm, verbose=0)
rmse = np.sqrt(mse)
print("MSE: {:.8f}".format(mse))
print("RMSE: {:.8f}".format(rmse))
MSE: 0.00000003 RMSE: 0.00016254
# Calculate the accuracy
accuracy_AE = accuracy_score(y_test, y_pred)
print("Fraud detection accuracy (AE): {:.2f}%".format(accuracy_AE * 100))
Fraud detection accuracy (AE): 99.47%